pkgs <- c("here", "tidyverse", "RColorBrewer", "viridis", "mgcv", "sdmTMB", "patchwork", "tidylog", "gratia", "readxl")
# minpack.lm needed if using nlsLM()
if (length(setdiff(pkgs, rownames(installed.packages()))) > 0) {
install.packages(setdiff(pkgs, rownames(installed.packages())), dependencies = T)
}
invisible(lapply(pkgs, library, character.only = T))
# Continuous colors
options(ggplot2.continuous.colour = "viridis")
# Not on CRAN !
# devtools::install_github("seananderson/ggsidekick")
library(ggsidekick)
theme_set(theme_sleek())
# Set relative path
home <- here::here()Length-at-age data preparation and modelling
Load packages
Read data
Read and join fish data and temperature data.
# Clean data
df <- read_excel(paste0(home, "/data/DruksiaiFish_2022march.xls")) |> dplyr::select(-"...15")
df <- df |>
rename(
year = Year,
age = Age,
area = Zone,
species = Species,
sex = Sex,
month = Month,
data_source = Data_source
) |>
mutate(
stage = NA,
stage = ifelse(year < 1982, "a", stage),
stage = ifelse(year >= 1982 & year <= 2003, "b", stage),
stage = ifelse(year >= 2004, "c", stage)
)
df2 <- df |>
mutate(
area = case_match(area,
c("šalta z.", "šalta z. ") ~ "cold",
c("šilta z.", "šilta z. ") ~ "warm",
c("37 kv.", "4 kv.", "49 kv.") ~ "unclear",
.default = area
)
)
# Add temperatures
temp_df <- read_csv(file = paste0(home, "/output/merged_temp_covar_data.csv")) |>
filter(.width == 0.95) |> # this just because rows are duplicate for each width! doesnt matter sinse we use median instead (ignore that the column is called mean; it's calculated with median_qi from tidybayes)
dplyr::select(mean_lst, year) |>
rename(lst = mean_lst)
df <- left_join(df, temp_df, by = "year")
df <- df |>
mutate(lst = ifelse(year < min(temp_df$year), mean(filter(temp_df, year < 1982)$lst), lst),
year = as.numeric(year))
xtabs(~ species + year, data = df)
#> year
#> species 1965 1978 1979 1980 1981 1982 1983 1984 1985 1986
#> Abramis brama 0 0 0 77 396 233 199 171 509 30
#> Alburnus alburnus 0 0 0 0 0 0 0 0 232 37
#> Blicca bjoerkna 0 0 0 4 0 0 0 48 131 0
#> Carassius carassius 0 0 0 0 0 0 0 0 0 0
#> Carassius gibelio 0 0 0 0 0 0 0 0 6 0
#> Coregonus albula 99 0 0 0 0 0 0 40 7 135
#> Cyprinus carpio 0 0 0 0 0 0 0 0 0 0
#> Cyprinus cyprio 0 0 0 0 0 0 0 0 0 0
#> Esox lucius 0 48 0 188 86 9 20 0 37 18
#> Gymnocephalus cernua 0 0 0 0 0 0 0 0 101 0
#> Leuciscus idus 0 0 0 0 8 0 0 0 8 0
#> Osmerus eperlanus 0 0 0 0 0 0 0 0 129 0
#> Perca fluviatilis 0 24 2 97 118 57 32 0 194 0
#> Rutilus rutilus 0 38 107 251 220 374 255 494 1014 97
#> Sardinius erythrophthalmus 0 0 0 0 1 9 0 0 10 0
#> Tinca tinca 0 0 0 0 0 0 0 0 7 0
#> year
#> species 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996
#> Abramis brama 4 112 13 0 621 110 1 0 7 319
#> Alburnus alburnus 0 0 28 0 234 43 42 0 77 77
#> Blicca bjoerkna 0 0 0 0 196 59 0 0 5 336
#> Carassius carassius 0 0 0 0 14 0 0 0 0 0
#> Carassius gibelio 0 0 0 0 22 1 0 0 0 0
#> Coregonus albula 17 10 0 0 158 103 95 0 2 72
#> Cyprinus carpio 0 0 0 0 0 0 0 0 0 0
#> Cyprinus cyprio 0 0 0 0 1 0 0 0 0 0
#> Esox lucius 2 6 1 0 92 11 4 0 20 0
#> Gymnocephalus cernua 0 0 0 0 61 23 0 14 80 78
#> Leuciscus idus 0 0 0 0 10 1 0 0 0 0
#> Osmerus eperlanus 0 0 0 0 2 0 0 0 0 0
#> Perca fluviatilis 0 21 141 0 374 70 26 0 562 574
#> Rutilus rutilus 368 188 844 44 930 504 442 71 39 489
#> Sardinius erythrophthalmus 0 0 0 0 157 12 0 25 6 386
#> Tinca tinca 0 0 0 0 49 1 0 0 0 0
#> year
#> species 1997 1998 1999 2005 2006 2007 2008 2009 2010 2011
#> Abramis brama 163 133 154 45 13 42 25 56 37 40
#> Alburnus alburnus 97 91 135 0 0 0 0 0 0 0
#> Blicca bjoerkna 274 232 157 9 0 0 0 0 0 0
#> Carassius carassius 0 0 0 0 0 0 0 0 0 0
#> Carassius gibelio 0 0 1 0 0 0 0 0 0 1
#> Coregonus albula 64 17 50 4 20 56 23 55 8 3
#> Cyprinus carpio 0 0 0 0 1 0 0 0 0 0
#> Cyprinus cyprio 0 0 0 0 0 0 0 0 0 0
#> Esox lucius 0 1 6 1 5 4 6 13 38 25
#> Gymnocephalus cernua 44 49 74 0 0 1 0 0 0 0
#> Leuciscus idus 0 0 0 0 0 0 0 0 0 0
#> Osmerus eperlanus 0 0 1 0 0 0 0 0 0 0
#> Perca fluviatilis 293 354 252 39 30 49 80 114 167 132
#> Rutilus rutilus 373 398 298 54 48 73 67 99 147 116
#> Sardinius erythrophthalmus 313 227 188 0 0 2 1 0 8 6
#> Tinca tinca 0 2 3 3 12 16 8 8 38 27
#> year
#> species 2012 2014 2015 2016 2017 2018 2019 2020 2021
#> Abramis brama 41 29 41 54 54 31 45 33 56
#> Alburnus alburnus 24 0 2 2 0 0 3 0 8
#> Blicca bjoerkna 2 0 0 6 0 0 0 2 5
#> Carassius carassius 2 0 0 2 0 0 0 0 1
#> Carassius gibelio 1 2 0 0 1 0 0 0 1
#> Coregonus albula 57 28 21 88 31 36 26 13 16
#> Cyprinus carpio 0 0 0 0 0 0 0 0 0
#> Cyprinus cyprio 0 0 0 0 0 0 0 0 0
#> Esox lucius 25 8 13 18 7 6 23 8 7
#> Gymnocephalus cernua 0 0 0 0 0 0 0 12 3
#> Leuciscus idus 0 0 0 0 0 0 0 0 0
#> Osmerus eperlanus 0 0 8 18 0 2 1 7 4
#> Perca fluviatilis 53 62 23 87 76 78 108 32 39
#> Rutilus rutilus 70 53 18 108 61 6 48 0 57
#> Sardinius erythrophthalmus 14 1 0 12 0 2 0 6 29
#> Tinca tinca 38 11 6 36 22 40 30 3 42
length(which(is.na(df$L == T)))
#> [1] 1673
length(which(is.na(df$l == T)))
#> [1] 283
# Rename species to common names
unique(df$species)
#> [1] "Coregonus albula" "Rutilus rutilus"
#> [3] "Esox lucius" "Perca fluviatilis"
#> [5] "Abramis brama" "Blicca bjoerkna"
#> [7] "Leuciscus idus" "Sardinius erythrophthalmus"
#> [9] "Alburnus alburnus" "Gymnocephalus cernua"
#> [11] "Osmerus eperlanus" "Carassius gibelio"
#> [13] "Tinca tinca" "Carassius carassius"
#> [15] "Cyprinus cyprio" "Cyprinus carpio"
df <- df |>
filter(species %in% c(
"Abramis brama", "Alburnus alburnus", "Blicca bjoerkna", "Gymnocephalus cernua",
"Gymnocephalus cernua", "Sardinius erythrophthalmus", "Tinca tinca",
"Osmerus eperlanus", "Perca fluviatilis", "Coregonus albula",
"Esox lucius", "Rutilus rutilus"
)) |>
mutate(
species = fct_recode(species,
"Bream" = "Abramis brama",
"Bleak" = "Alburnus alburnus",
"Silver bream" = "Blicca bjoerkna",
"Ruffe" = "Gymnocephalus cernua",
"Rudde" = "Sardinius erythrophthalmus",
"Tench" = "Tinca tinca",
"Smelt" = "Osmerus eperlanus",
"Perch" = "Perca fluviatilis",
"Vendace" = "Coregonus albula",
"Pike" = "Esox lucius",
"Roach" = "Rutilus rutilus"
)
) |>
drop_na(l)
# Filter ages
df |>
filter(species == "Roach") |>
ggplot(aes(l)) +
geom_histogram() +
facet_wrap(~year, scales = "free_y")
# Some odd observations in roach - likely to be L and not l. For now we remove them
df <- df |>
filter(
!(species == "Roach" & month == 5 & year == "1980"),
!(species == "Roach" & month == 8 & year == "1980" & age < 7),
!(species == "Roach" & age == 4 & l < 2)
)
# Explore length vs age
ggplot(df, aes(age, l, color = data_source)) +
facet_wrap(~species, scales = "free", ncol = 5) +
geom_point(alpha = 0.2) +
theme(legend.position = "bottom")
# Some data entry errors in bream as well, filter these! (a 1 yr old bream is not 37 cm!)
df <- df |>
mutate(keep = ifelse(species == "Bream" & age < 2 & l > 30, "N", "Y")) |>
filter(keep == "Y") |>
dplyr::select(-keep)
# And in vendece...
df <- df |>
mutate(keep = ifelse(species == "Vendace" & age >= 3 & l < 12, "N", "Y")) |>
filter(keep == "Y") |>
dplyr::select(-keep)
# Replace NA with new level
df <- df |> mutate(data_source = replace_na(data_source, "missing_data_source"))To get covariates that represent the average temperatures during the lifetime, we’ll here add lagged variables. This code can certainly be written in a better way but it does the job …
# Add various lagged temperature metrics (average lifetime temperature etc). We don't have temperatures for the oldest cohorts, so we give them the average of the stage a period
df <- df |> mutate(cohort = year - age)
#> mutate: new variable 'cohort' (double) with 62 unique values and <1% NA
min(df$cohort, na.rm = TRUE)
#> [1] 1961
min(temp_df$year)
#> [1] 1978
min(temp_df$year) - min(df$cohort, na.rm = TRUE)
#> [1] 17
# Need to extend it 17 years
years_to_add <- seq(from = min(df$cohort, na.rm = TRUE), to = min(temp_df$year - 1))
temp_df_temp <- data.frame(
year = years_to_add,
lst = rep(mean(filter(temp_df, year < 1982)$lst), length(years_to_add)),
stage = rep("a", length(years_to_add))
)
#> filter: removed 41 rows (91%), 4 rows remaining
temp_df2 <- bind_rows(temp_df_temp, temp_df) |>
mutate(
stage = NA,
stage = ifelse(year < 1982, "a", stage),
stage = ifelse(year >= 1982 & year <= 2003, "b", stage),
stage = ifelse(year >= 2004, "c", stage)
) |>
arrange(year)
#> mutate: changed 45 values (73%) of 'stage' (45 fewer NA)
# Ok, now, based on the cohort and age, we need to average certain columns and match to the length-at-age data...
# Lag variables equal to the oldest fish in the data
temp_df2 <- temp_df2 |>
mutate(
lst_t_minus_1 = lag(lst, n = 1),
lst_t_minus_2 = lag(lst, n = 2),
lst_t_minus_3 = lag(lst, n = 3),
lst_t_minus_4 = lag(lst, n = 4),
lst_t_minus_5 = lag(lst, n = 5),
lst_t_minus_6 = lag(lst, n = 6),
lst_t_minus_7 = lag(lst, n = 7),
lst_t_minus_8 = lag(lst, n = 8),
lst_t_minus_9 = lag(lst, n = 9),
lst_t_minus_10 = lag(lst, n = 10),
lst_t_minus_11 = lag(lst, n = 11),
lst_t_minus_12 = lag(lst, n = 12),
lst_t_minus_13 = lag(lst, n = 13),
lst_t_minus_14 = lag(lst, n = 14),
lst_t_minus_15 = lag(lst, n = 15),
lst_t_minus_16 = lag(lst, n = 16),
lst_t_minus_17 = lag(lst, n = 17),
lst_t_minus_18 = lag(lst, n = 18),
lst_t_minus_19 = lag(lst, n = 19),
lst_t_minus_20 = lag(lst, n = 20),
lst_t_minus_21 = lag(lst, n = 21)
)
#> mutate: new variable 'lst_t_minus_1' (double) with 46 unique values and 2% NA
#> new variable 'lst_t_minus_2' (double) with 45 unique values and 3% NA
#> new variable 'lst_t_minus_3' (double) with 44 unique values and 5% NA
#> new variable 'lst_t_minus_4' (double) with 43 unique values and 6% NA
#> new variable 'lst_t_minus_5' (double) with 42 unique values and 8% NA
#> new variable 'lst_t_minus_6' (double) with 41 unique values and 10% NA
#> new variable 'lst_t_minus_7' (double) with 40 unique values and 11% NA
#> new variable 'lst_t_minus_8' (double) with 39 unique values and 13% NA
#> new variable 'lst_t_minus_9' (double) with 38 unique values and 15% NA
#> new variable 'lst_t_minus_10' (double) with 37 unique values and 16% NA
#> new variable 'lst_t_minus_11' (double) with 36 unique values and 18% NA
#> new variable 'lst_t_minus_12' (double) with 35 unique values and 19% NA
#> new variable 'lst_t_minus_13' (double) with 34 unique values and 21% NA
#> new variable 'lst_t_minus_14' (double) with 33 unique values and 23% NA
#> new variable 'lst_t_minus_15' (double) with 32 unique values and 24% NA
#> new variable 'lst_t_minus_16' (double) with 31 unique values and 26% NA
#> new variable 'lst_t_minus_17' (double) with 30 unique values and 27% NA
#> new variable 'lst_t_minus_18' (double) with 29 unique values and 29% NA
#> new variable 'lst_t_minus_19' (double) with 28 unique values and 31% NA
#> new variable 'lst_t_minus_20' (double) with 27 unique values and 32% NA
#> new variable 'lst_t_minus_21' (double) with 26 unique values and 34% NA
# Add in the same years that I extended the temperature data with in the length so that I can do a left_join to get in the lagged variables
years_to_add2 <- temp_df2 |>
dplyr::select(year) |>
filter(year < min(df$year))
#> filter: removed 45 rows (73%), 17 rows remaining
df2 <- bind_rows(years_to_add2, df)
# Join in the lagged temperature variables to the length data. Before I do that though, it will be much easier indexing further down if temp (lst) is the last column...
temp_col <- df2 |> dplyr::select(lst)
df2 <- df2 |> dplyr::select(-lst)
df2$lst <- temp_col$lst
colnames(df2)
#> [1] "year" "Date" "month" "Day" "species"
#> [6] "code" "L" "l" "Q" "age"
#> [11] "sex" "Scales" "area" "data_source" "stage"
#> [16] "cohort" "lst"
# Now join
df3 <- left_join(df2, temp_df2)
#> Joining with `by = join_by(year, stage, lst)`
#> left_join: added 21 columns (lst_t_minus_1, lst_t_minus_2, lst_t_minus_3, lst_t_minus_4, lst_t_minus_5, …)
#> > rows only in x 17
#> > rows only in y ( 24)
#> > matched rows 24,043
#> > ========
#> > rows total 24,060
# Check an example: age 5 fish in 1991
df3 |>
filter(age == 5 & year == 1991) |>
dplyr::select(
year, cohort, age, l, species, lst, lst_t_minus_1, lst_t_minus_2, lst_t_minus_3,
lst_t_minus_4, lst_t_minus_5, lst_t_minus_6
) |>
arrange(year) |>
head(1)
#> filter: removed 23,486 rows (98%), 574 rows remaining
#> year cohort age l species lst lst_t_minus_1 lst_t_minus_2 lst_t_minus_3
#> 1 1991 1986 5 17 Bream 18.56636 19.91319 20.51592 20.29426
#> lst_t_minus_4 lst_t_minus_5 lst_t_minus_6
#> 1 16.20713 19.20568 18.40932
temp_df |>
filter(year < 1992 & year > 1985) |>
as.data.frame() |>
arrange(desc(year))
#> filter: removed 39 rows (87%), 6 rows remaining
#> lst year
#> 1 18.56636 1991
#> 2 19.91319 1990
#> 3 20.51592 1989
#> 4 20.29426 1988
#> 5 16.20713 1987
#> 6 19.20568 1986
# Check an example: age 10 fish in 1980
min(temp_df$year)
#> [1] 1978
df3 |>
filter(age == 10 & year == 1980) |>
dplyr::select(
year, cohort, age, l, species, lst, lst_t_minus_1, lst_t_minus_2, lst_t_minus_3,
lst_t_minus_4, lst_t_minus_5, lst_t_minus_6
) |>
arrange(year) |>
head(1)
#> filter: removed 24,017 rows (>99%), 43 rows remaining
#> year cohort age l species lst lst_t_minus_1 lst_t_minus_2 lst_t_minus_3
#> 1 1980 1970 10 31 Perch 16.19834 17.3763 17.47408 17.09762
#> lst_t_minus_4 lst_t_minus_5 lst_t_minus_6
#> 1 17.09762 17.09762 17.09762
temp_df |>
filter(year < 1981 & year > 1969) |>
as.data.frame() |>
arrange(desc(year))
#> filter: removed 42 rows (93%), 3 rows remaining
#> lst year
#> 1 16.19834 1980
#> 2 17.37630 1979
#> 3 17.47408 1978
temp_df2 |>
filter(year < 1981 & year > 1969) |>
as.data.frame() |>
dplyr::select(lst, year) |>
arrange(desc(year))
#> filter: removed 51 rows (82%), 11 rows remaining
#> lst year
#> 1 16.19834 1980
#> 2 17.37630 1979
#> 3 17.47408 1978
#> 4 17.09762 1977
#> 5 17.09762 1976
#> 6 17.09762 1975
#> 7 17.09762 1974
#> 8 17.09762 1973
#> 9 17.09762 1972
#> 10 17.09762 1971
#> 11 17.09762 1970
df3 |>
filter(year %in% c(1986, 1987, 1988, 1989, 1990, 1991)) |>
dplyr::select(
year, cohort, age, lst, lst_t_minus_1, lst_t_minus_2, lst_t_minus_3,
lst_t_minus_4, lst_t_minus_5, lst_t_minus_6
) |>
arrange(year) |>
distinct(year, .keep_all = TRUE)
#> filter: removed 19,177 rows (80%), 4,883 rows remaining
#> distinct: removed 4,877 rows (>99%), 6 rows remaining
#> year cohort age lst lst_t_minus_1 lst_t_minus_2 lst_t_minus_3
#> 1 1986 1966 20 19.20568 18.40932 17.63245 17.62352
#> 2 1987 1976 11 16.20713 19.20568 18.40932 17.63245
#> 3 1988 1981 7 20.29426 16.20713 19.20568 18.40932
#> 4 1989 1974 15 20.51592 20.29426 16.20713 19.20568
#> 5 1990 1984 6 19.91319 20.51592 20.29426 16.20713
#> 6 1991 1991 0 18.56636 19.91319 20.51592 20.29426
#> lst_t_minus_4 lst_t_minus_5 lst_t_minus_6
#> 1 17.46301 17.34176 16.19834
#> 2 17.62352 17.46301 17.34176
#> 3 17.63245 17.62352 17.46301
#> 4 18.40932 17.63245 17.62352
#> 5 19.20568 18.40932 17.63245
#> 6 16.20713 19.20568 18.40932
# All good. What remains now is to take the average across the columns of lagged temperatures. The number of columns to calculate on depends on the age.
df4 <- df3 |> drop_na(age)
#> drop_na: removed 55 rows (<1%), 24,005 rows remaining
# Fill in columns with NA before looping
df4$lifetime_avg_temp <- NA
df4$birth_temp <- NA
df4$first_two_yrs_temp <- NA
str(df4)
#> 'data.frame': 24005 obs. of 41 variables:
#> $ year : num 1978 1978 1978 1978 1978 ...
#> $ Date : chr "1978.06.01" "1978.06.01" "1978.06.01" "1978.06.01" ...
#> $ month : num 6 6 6 6 6 6 6 6 6 6 ...
#> $ Day : num NA NA NA NA NA NA NA NA NA NA ...
#> $ species : Factor w/ 11 levels "Bream","Bleak",..: 9 9 9 9 9 5 5 9 9 5 ...
#> $ code : chr "34/14" "35/15" "36/16" "37/17" ...
#> $ L : chr NA NA NA NA ...
#> $ l : num 17 16 15 18 18 32 36 18 17 42 ...
#> $ Q : num 100 85 65 125 110 310 640 115 115 845 ...
#> $ age : num 7 6 6 8 7 4 5 8 8 5 ...
#> $ sex : chr "9" "9" "1" "9" ...
#> $ Scales : chr NA NA NA NA ...
#> $ area : chr "?" "?" "?" "?" ...
#> $ data_source : chr "D5" "D5" "D5" "D5" ...
#> $ stage : chr "a" "a" "a" "a" ...
#> $ cohort : num 1971 1972 1972 1970 1971 ...
#> $ lst : num 17.5 17.5 17.5 17.5 17.5 ...
#> $ lst_t_minus_1 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_2 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_3 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_4 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_5 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_6 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_7 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_8 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_9 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_10 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_11 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_12 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_13 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_14 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_15 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_16 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_17 : num 17.1 17.1 17.1 17.1 17.1 ...
#> $ lst_t_minus_18 : num NA NA NA NA NA NA NA NA NA NA ...
#> $ lst_t_minus_19 : num NA NA NA NA NA NA NA NA NA NA ...
#> $ lst_t_minus_20 : num NA NA NA NA NA NA NA NA NA NA ...
#> $ lst_t_minus_21 : num NA NA NA NA NA NA NA NA NA NA ...
#> $ lifetime_avg_temp : logi NA NA NA NA NA NA ...
#> $ birth_temp : logi NA NA NA NA NA NA ...
#> $ first_two_yrs_temp: logi NA NA NA NA NA NA ...
# Loop through all rows and calculate the average lifetime temperature by averaging across as many columns the fish is old
for (i in 1:nrow(df4)) {
# temperatures start with temp on col #17; colnames(df4); df4 |> select(17) |> head()
# Lifetime
df4$lifetime_avg_temp[i] <- ifelse(df4$age[i] == 0,
df4[i, 17],
rowMeans(df4[i, 17:(17 + (df4$age[i]))])
)
# Birth temp
df4$birth_temp[i] <-
df4[i, (17 + (df4$age[i]))]
}
# Check it worked by plotting the temperatures in the growth data against manual calculations
# Here, if age == 0, lst and birth temp and average lifetime birth temp should be the same
df4 |>
filter(age == 0) |>
ggplot(aes(lst, lifetime_avg_temp)) +
geom_point() +
geom_abline()
#> filter: removed 23,779 rows (99%), 226 rows remaining
df4 |>
filter(age == 0) |>
ggplot(aes(lst, birth_temp)) +
geom_point() +
geom_abline()
#> filter: removed 23,779 rows (99%), 226 rows remaining
# Looks alright for age zeroes...
# For age 1 fish, Test age one lifetime avg
df4 |>
filter(age == 1) |>
mutate(lifetime_avg_temp2 = (lst + lst_t_minus_1) / 2) |> # manually calculate the average of current temperature and lagged 1 year temperature
ggplot(aes(lifetime_avg_temp2, lifetime_avg_temp)) +
geom_point() +
geom_abline()
#> filter: removed 23,410 rows (98%), 595 rows remaining
#> mutate: new variable 'lifetime_avg_temp2' (double) with 24 unique values and 0% NA
# Age 2
df4 |>
filter(age == 2) |>
mutate(lifetime_avg_temp2 = (lst + lst_t_minus_1 + lst_t_minus_2) / 3) |> # manually calculate the average of current temperature and lagged 1 year and lagged 2 year temperatures
ggplot(aes(lifetime_avg_temp2, lifetime_avg_temp)) +
geom_point() +
geom_abline()
#> filter: removed 22,768 rows (95%), 1,237 rows remaining
#> mutate: new variable 'lifetime_avg_temp2' (double) with 35 unique values and 0% NA
# Age 3 birth temp
df4 |>
filter(age == 3) |>
mutate(birth_temp2 = lst_t_minus_3) |>
ggplot(aes(birth_temp2, birth_temp)) +
geom_point() +
geom_abline()
#> filter: removed 21,796 rows (91%), 2,209 rows remaining
#> mutate: new variable 'birth_temp2' (double) with 34 unique values and 0% NA
# Another test... all cohorts before 1978 should have the same temperature
df4 |>
filter(cohort < 1978) |>
distinct(birth_temp)
#> filter: removed 19,435 rows (81%), 4,570 rows remaining
#> distinct: removed 4,569 rows (>99%), one row remaining
#> birth_temp
#> 1 17.09762
tt <- df4 |> filter(is.na(birth_temp))
#> filter: removed all rows (100%)
tt <- df4 |> filter(is.na(lifetime_avg_temp))
#> filter: removed all rows (100%)Explore data
# if starting from here load df4
# load(file = paste0(home, "/data/druksiai_beforeFiltering.RData"))
# Sample size
df4 |>
group_by(year, species, age) |>
summarise(n = n()) |>
ggplot(aes(year, n, fill = factor(age))) +
geom_bar(stat = "identity") +
facet_wrap(~species, scales = "free_y")
#> group_by: 3 grouping variables (year, species, age)
#> summarise: now 1,819 rows and 4 columns, 2 group variables remaining (year, species)
# Histogram of lengths
ggplot(df4, aes(l)) +
geom_histogram() +
facet_wrap(~species, scales = "free")
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.Trim data
# Trim data somewhat, and include datasource as a random effect
df5 <- df4 |>
ungroup() |>
group_by(species, age) |>
mutate(
sd_l = sd(l, na.rm = TRUE),
mean_l = mean(l, na.rm = TRUE)
) |>
filter(is.finite(sd_l)) |>
ungroup() |>
mutate(
keep = ifelse(l >= mean_l + 4 * sd_l, "N", "Y"),
keep = ifelse(l <= mean_l - 4 * sd_l, "N", keep)
)
#> ungroup: no grouping variables
#> group_by: 2 grouping variables (species, age)
#> mutate (grouped): new variable 'sd_l' (double) with 138 unique values and <1% NA
#> new variable 'mean_l' (double) with 148 unique values and 0% NA
#> filter (grouped): removed 9 rows (<1%), 23,996 rows remaining
#> ungroup: no grouping variables
#> mutate: new variable 'keep' (character) with 2 unique values and 0% NA
df5 <- df5 |>
tidylog::filter(!(species == "Silver bream" & l < 5 & age > 10))
#> filter: removed one row (<1%), 23,995 rows remaining
ggplot(df5, aes(age, l, color = keep)) +
geom_jitter(alpha = 3/4) +
facet_wrap(~species, scales = "free", ncol = 3)
df5 <- df5 |>
mutate(
data_source2 = case_when(
data_source == "žvynų knygutės" ~ "scales",
data_source == "missing_data_source" ~ "missing_data_source",
data_source %in% c("seni kompų hardai", "senų kompų hardai") ~ "digitised_old_data",
TRUE ~ "journal"
)
)
#> mutate: new variable 'data_source2' (character) with 4 unique values and 0% NA
ggplot(df5, aes(year, fill = data_source2)) +
geom_bar(stat = "count") +
facet_wrap(~species)
# Fix some variables
df5 <- df5 |>
drop_na(age) |>
drop_na(lifetime_avg_temp) |>
drop_na(month) |>
drop_na(l) |>
dplyr::select(-data_source) |>
rename(data_source = data_source2) |>
mutate(
f_month = as.factor(month),
f_age = as.factor(age),
f_data_source = as.factor(data_source)
)
#> drop_na: no rows removed
#> drop_na: no rows removed
#> drop_na: removed 4,491 rows (19%), 19,504 rows remaining
#> drop_na: no rows removed
#> rename: renamed one variable (data_source)
#> mutate: new variable 'f_month' (factor) with 12 unique values and 0% NA
#> new variable 'f_age' (factor) with 22 unique values and 0% NA
#> new variable 'f_data_source' (factor) with 4 unique values and 0% NA
# vendace - 2-3 years;
# bream - 5-7 years;
# perch - 2-3 years;
# roach - 3-5 years;
# pike - 3 years.
# Bleak (Alburnus) – 2-3 years;
# Silverbream – 3-4 years
# Ruffe – 2 years
# Smelt – 2 years
# Rudde (Scardinius) – 3-4 years
# Tench (Tinca) – 3-5 years
# Plot age distributions to figure out which constitutes "old" fish
# Add maturation age by species
df5 <- df5 |>
mutate(
life_stage = "Adult",
life_stage = ifelse(species == "Vendace" & age %in% c(2, 3), "Maturation age", life_stage),
life_stage = ifelse(species == "Vendace" & age < 2, "Juvenile", life_stage),
life_stage = ifelse(species == "Bream" & age %in% c(5, 6, 7), "Maturation age", life_stage),
life_stage = ifelse(species == "Bream" & age < 5, "Juvenile", life_stage),
life_stage = ifelse(species == "Perch" & age %in% c(2, 3), "Maturation age", life_stage),
life_stage = ifelse(species == "Perch" & age < 2, "Juvenile", life_stage),
life_stage = ifelse(species == "Roach" & age %in% c(3, 4, 5), "Maturation age", life_stage),
life_stage = ifelse(species == "Roach" & age < 3, "Juvenile", life_stage),
life_stage = ifelse(species == "Pike" & age %in% c(3), "Maturation age", life_stage),
life_stage = ifelse(species == "Pike" & age < 3, "Juvenile", life_stage),
life_stage = ifelse(species == "Bleak" & age %in% c(2, 3), "Maturation age", life_stage),
life_stage = ifelse(species == "Bleak" & age < 2, "Juvenile", life_stage),
life_stage = ifelse(species == "Silver bream" & age %in% c(3, 4), "Maturation age", life_stage),
life_stage = ifelse(species == "Silver bream" & age < 3, "Juvenile", life_stage),
life_stage = ifelse(species == "Ruffe" & age %in% c(2), "Maturation age", life_stage),
life_stage = ifelse(species == "Ruffe" & age < 2, "Juvenile", life_stage),
life_stage = ifelse(species == "Smelt" & age %in% c(2), "Maturation age", life_stage),
life_stage = ifelse(species == "Smelt" & age < 2, "Juvenile", life_stage),
life_stage = ifelse(species == "Rudde" & age %in% c(3, 4), "Maturation age", life_stage),
life_stage = ifelse(species == "Rudde" & age < 3, "Juvenile", life_stage),
life_stage = ifelse(species == "Tench" & age %in% c(3, 4, 5), "Maturation age", life_stage),
life_stage = ifelse(species == "Tench" & age < 3, "Juvenile", life_stage)
) |>
mutate(break_age = max(age) / 2) |>
mutate(life_stage = ifelse(age > break_age & life_stage == "Adult", "Old adult", life_stage))
#> mutate: new variable 'life_stage' (character) with 3 unique values and 0% NA
#> mutate: new variable 'break_age' (double) with one unique value and 0% NA
#> mutate: changed 3,303 values (17%) of 'life_stage' (0 new NA)
ggplot(df5, aes(age, fill = life_stage)) +
geom_histogram() +
facet_wrap(~species, scales = "free")
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.# Final exploration:
# yearly data distribution by species. Note, smelt has too few years and cohorts (short lived), so we will remove it
df5 |>
group_by(species) |>
distinct(cohort, .keep_all = TRUE) |>
ggplot(aes(cohort, species)) +
geom_point()
#> group_by: one grouping variable (species)
#> distinct (grouped): removed 19,066 rows (98%), 438 rows remaining
# smelt has only 12 cohorts, other species >20
df5 |>
group_by(species) |>
summarise(n_cohorts = n_distinct(cohort)) |>
arrange(n_cohorts, desc(n_cohorts))
#> group_by: one grouping variable (species)
#> summarise: now 11 rows and 2 columns, ungrouped
#> # A tibble: 11 × 2
#> species n_cohorts
#> <fct> <int>
#> 1 Smelt 12
#> 2 Ruffe 24
#> 3 Bleak 30
#> 4 Vendace 37
#> 5 Tench 39
#> 6 Silver bream 41
#> 7 Rudde 41
#> 8 Pike 46
#> 9 Bream 54
#> 10 Roach 56
#> 11 Perch 58df6 <- df5 |>
filter(!species == "Smelt") |>
filter(keep == "Y") |>
dplyr::select(
year, species, l, age, f_age, f_month, stage, cohort, lst,
lifetime_avg_temp, birth_temp, data_source, f_data_source, life_stage
)
#> filter: removed 171 rows (1%), 19,333 rows remaining
#> filter: removed 135 rows (1%), 19,198 rows remaining
save(df6, file = paste0(home, "/data/druksiai2.RData"))